import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from datetime import datetime as dt2
from datetime import timedelta as td
from datetime import time as tm
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import plotly.offline as pyoff
import plotly.graph_objs as go
import feature_engine
from feature_engine.outliers import Winsorizer
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from sklearn.metrics import davies_bouldin_score
import warnings
warnings.filterwarnings("ignore")
# Load the Online Retail transactions (one row per invoice line item).
df = pd.read_excel('Online Retail.xlsx')
#Check Dataset
df.head()          # preview the first rows
df.info()          # column dtypes and non-null counts
df.describe()      # summary statistics for numeric columns
df.nunique()       # cardinality of each column
df.isnull().sum()  # missing values per column
Roughly 25% of the records are missing CustomerID. We will exclude these records from our analysis.
#Country: share of transactions per country
df.Country.value_counts(normalize=True)
More than 90% of the transactions come from 'United Kingdom' customers, so we restrict the analysis to United Kingdom customers.
# Keep only United Kingdom transactions (>90% of the data).
df = df[df.Country == 'United Kingdom']
Next we remove records with non-positive values in Quantity and UnitPrice.
#Quantity: keep only positive quantities (negative rows presumably are returns/cancellations — TODO confirm)
df = df[df.Quantity > 0]
#InvoiceDate
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
# Year-month as an integer key, e.g. 2011-03 -> 201103.
df['InvoiceYearMonth'] = df['InvoiceDate'].map(lambda date: 100*date.year + date.month)
# Year-month as a string, e.g. '2011-03' (this column is overwritten later).
df['Date'] = df['InvoiceDate'].dt.strftime('%Y-%m')
#timebound: first and last invoice dates in the data
print(df["InvoiceDate"].min())
print(df["InvoiceDate"].max())
#UnitPrice: drop zero/negative prices
df = df[df.UnitPrice > 0]
#CustomerID: drop rows without a customer identifier (~25% of the raw data)
df = df[pd.notnull(df['CustomerID'])]
df.info()
#Aggregating the Orders by Month (sum of quantities per 'YYYY-MM')
df_agg= df.groupby("Date").Quantity.sum()
df_agg.head()
#converting series to dataframe and resetting index.
df_agg=pd.DataFrame(df_agg)
df_agg=df_agg.reset_index()
df_agg.head()
def plot_df(df, x, y, title="", xlabel='Date', ylabel='Quantity', dpi=100):
    """Draw a simple marker-line plot of y against x.

    The `df` argument is unused but kept for interface compatibility
    with existing callers.
    """
    plt.figure(figsize=(16, 5), dpi=dpi)
    axes = plt.gca()
    axes.set(title=title, xlabel=xlabel, ylabel=ylabel)
    plt.plot(x, y, color='tab:Blue', marker='o')
    plt.show()
# Monthly order-quantity trend.
plot_df(df_agg, x=df_agg.Date, y=df_agg.Quantity,title='Orders in 2011')
#Calculating Revenue
#Revenue = Order Count * Average Revenue per Order
df['Revenue'] = df['Quantity']*df['UnitPrice']
# Boxplot to inspect the spread / outliers of line-item revenue.
sns.boxplot(y=df['Revenue'])
df.info()
#Monthly Revenue: total revenue per year-month key
df_revenue = df.groupby(['InvoiceYearMonth'])['Revenue'].sum().reset_index()
df_revenue.head()
# Interactive plotly line chart of monthly revenue.
plot_data = [
go.Scatter(
x=df_revenue['InvoiceYearMonth'],
y=df_revenue['Revenue'],
mode='lines+markers'
)
]
plot_layout = go.Layout(
xaxis={"type": "category"},  # treat the YYYYMM keys as categories, not numbers
title='Monthly Revenue'
)
fig = go.Figure(data=plot_data, layout=plot_layout)
pyoff.iplot(fig)
# Reference "as of" date for recency (appears to match the dataset's last
# invoice date — confirm against the printed max InvoiceDate above).
NOW = dt.date(2011,12,9)
# Overwrite 'Date' with the invoice calendar date (datetime.date objects).
df['Date'] = pd.DatetimeIndex(df.InvoiceDate).date
#Recency: days since each customer's last purchase, measured from NOW
df_recency = df.groupby(['CustomerID'],as_index=False)['Date'].max()
df_recency.columns = ['CustomerID','Last_Purchase_Date']
df_recency['Recency'] = df_recency.Last_Purchase_Date.apply(lambda x:(NOW - x).days)
df_recency.drop(columns=['Last_Purchase_Date'],inplace=True)
df_recency.head()
#Frequency - Monetary
# NOTE(review): Frequency counts invoice *line items* per customer, not
# distinct invoices — use InvoiceNo.nunique() if distinct orders are intended.
FM_Table = df.groupby('CustomerID').agg({'InvoiceNo' : lambda x:len(x),
'Revenue' : lambda x:x.sum()})
FM_Table.rename(columns = {'InvoiceNo' :'Frequency',
'Revenue':'Monetary'},inplace= True)
FM_Table.head()
# Combine Recency with Frequency/Monetary into one per-customer RFM table.
RFM_Table = df_recency.merge(FM_Table,left_on='CustomerID',right_on='CustomerID')
RFM_Table.head()
#Heatmap check: correlations between the RFM columns
sns.heatmap(RFM_Table.corr(), annot=True);
# Quartile cut points per metric, as a nested dict {column: {quantile: value}}.
quantiles = RFM_Table.quantile(q=[0.25,0.50,0.75])
quantiles = quantiles.to_dict()
segmented_rfm = RFM_Table.copy()
def RScore(x,p,d):
    """Quartile score for Recency: 1 (most recent / best) to 4 (least recent).

    x: the metric value; p: the column name; d: nested quantile dict
    {column: {0.25: v, 0.50: v, 0.75: v}}.
    """
    score = 1
    for cutoff in (d[p][0.25], d[p][0.50], d[p][0.75]):
        if x <= cutoff:
            return score
        score += 1
    return 4
def FMScore(x,p,d):
    """Quartile score for Frequency/Monetary: 1 (highest / best) to 4 (lowest).

    x: the metric value; p: the column name; d: nested quantile dict
    {column: {0.25: v, 0.50: v, 0.75: v}}.
    """
    for rank, cutoff in zip((4, 3, 2), (d[p][0.25], d[p][0.50], d[p][0.75])):
        if x <= cutoff:
            return rank
    return 1
# Score each customer 1 (best) to 4 (worst) on each RFM dimension.
segmented_rfm['R_quartile'] = segmented_rfm['Recency'].apply(RScore, args=('Recency',quantiles))
segmented_rfm['F_quartile'] = segmented_rfm['Frequency'].apply(FMScore, args=('Frequency',quantiles))
segmented_rfm['M_quartile'] = segmented_rfm['Monetary'].apply(FMScore, args=('Monetary',quantiles))
segmented_rfm.head()
# RFM_Score = R_quartile + F_quartile + M_quartile
# Concatenated quartile codes, e.g. '111' = best on all three dimensions.
segmented_rfm['RFM_Segment'] = segmented_rfm.R_quartile.map(str)+segmented_rfm.F_quartile.map(str)+segmented_rfm.M_quartile.map(str)
segmented_rfm.head()
segmented_rfm['RFM_Score'] = segmented_rfm[['R_quartile','F_quartile','M_quartile']].sum(axis=1)
segmented_rfm.head()
# Headcounts for a few notable segments.
print("Best Customers: ",len(segmented_rfm[segmented_rfm['RFM_Segment']=='111']))
print('Loyal Customers: ',len(segmented_rfm[segmented_rfm['F_quartile']==1]))
print("Big Spenders: ",len(segmented_rfm[segmented_rfm['M_quartile']==1]))
print('Almost Lost: ', len(segmented_rfm[segmented_rfm['RFM_Segment']=='134']))
print('Lost Customers: ',len(segmented_rfm[segmented_rfm['RFM_Segment']=='344']))
print('Lost Cheap Customers: ',len(segmented_rfm[segmented_rfm['RFM_Segment']=='444']))
# Setting up the label for each client and adding the column "Label" to the dataframe
def _segment_label(row):
    """Map one customer's RFM quartile codes to a named marketing segment.

    The order of the checks matters: exact-segment matches ('111', '134',
    '344', '444') take priority over the broader Loyal (F=1) and
    Big Spenders (M=1) buckets; everything else is 'Others'.
    """
    if row['RFM_Segment'] == '111':
        return "Best Customers"
    if row['RFM_Segment'] == '134':
        return "Almost Lost"
    if row['RFM_Segment'] == '344':
        return "Lost Customers"
    if row['RFM_Segment'] == '444':
        return "Lost Cheap Customers"
    if row['F_quartile'] == 1:
        return "Loyal Customers"
    if row['M_quartile'] == 1:
        return "Big Spenders"
    return "Others"

# Adding the 'Label' column to our dataframe. The original code assigned the
# column twice and used positional chained indexing (df['col'][i]), which
# breaks if the index is not a clean RangeIndex; a row-wise apply is robust
# and produces identical labels.
segmented_rfm['Label'] = segmented_rfm.apply(_segment_label, axis=1)
import seaborn as sns  # NOTE(review): redundant — seaborn is already imported at the top of the file
# Unique customers per segment label, largest first.
sq1=segmented_rfm.groupby('Label')['CustomerID'].nunique().sort_values(ascending=False).reset_index()
plt.figure(figsize=(12,8))
# Drop the first (largest) row — presumably the catch-all 'Others' bucket; verify.
sq1.drop([0],inplace=True)
sns.barplot(data=sq1, x="Label", y="CustomerID", palette = "Greens_d");
# Modeling Data: K-Means Clustering
K-Means clustering is an unsupervised machine learning algorithm that iteratively partitions unlabeled data points into K clusters, such that each point belongs to exactly one cluster of points with similar properties.
# Drop the identifier column — only the R/F/M features feed the clustering.
RFM_Table_New = RFM_Table.drop('CustomerID', axis=1)
RFM_Table_New.head()
#Heatmap check
sns.heatmap(RFM_Table_New.corr(), annot=True);
#Analyze distribution of each metric before transformation
sns.distplot(RFM_Table_New['Recency'])
sns.distplot(RFM_Table_New['Frequency'])
sns.distplot(RFM_Table_New['Monetary'])
K-Means gives the best results when the variables have a roughly symmetric distribution (low skew), contain no extreme outliers, and share the same scale. The following steps — log transform, winsorisation, and standardisation — prepare the data accordingly.
df_rfm_log = RFM_Table_New.copy()
df_rfm_log.head()
# log(x+1) transform to reduce right skew (the +1 keeps zero values valid).
df_rfm_log = np.log(df_rfm_log+1)
# Cap extreme values on both tails; with fold=2 and the Winsorizer's default
# capping method this caps at mean +/- 2*std — confirm against the
# feature-engine version in use.
windsoriser = Winsorizer(tail='both', # cap left, right or both tails
fold=2,
variables=[ 'Recency', 'Frequency', 'Monetary']
)
windsoriser.fit(df_rfm_log)
df_rfm_log = windsoriser.transform(df_rfm_log)
Once the skewness is reduced, the data is standardised by centring and scaling. Note that all the variables now have a mean of 0 and a standard deviation of 1.
#Developing preprocessing with StandardScaler
scaler = StandardScaler()
scaler.fit(df_rfm_log)
# transform() returns a plain ndarray; rebuild a DataFrame with the original
# column names so downstream code can keep using column access.
RFM_Table_New_scaled = scaler.transform(df_rfm_log)
RFM_Table_New_scaled = pd.DataFrame(RFM_Table_New_scaled, columns=RFM_Table_New.columns)
RFM_Table_New_scaled.head()
#Analyze distribution after log + winsorise + scale
sns.distplot(RFM_Table_New_scaled['Recency'])
sns.distplot(RFM_Table_New_scaled['Frequency'])
sns.distplot(RFM_Table_New_scaled['Monetary'])
#Finding the optimal number of clusters via the elbow method
X = np.asarray(RFM_Table_New_scaled)
Sum_of_squared_distances = []
K = range(1,16)
# Fit K-Means for each K and record the inertia (within-cluster sum of
# squared distances); the "elbow" in the curve suggests a good K.
for k in K:
km = KMeans(n_clusters=k)
km = km.fit(X)
Sum_of_squared_distances.append(km.inertia_)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
def kmeans(normalised_df_rfm, clusters_number, original_df_rfm):
    """Cluster the scaled RFM data with K-Means and visualise it via t-SNE.

    Parameters
    ----------
    normalised_df_rfm : DataFrame of scaled R/F/M features used for clustering.
    clusters_number : int, the number of clusters K.
    original_df_rfm : DataFrame the cluster labels are attached to.

    Returns
    -------
    A copy of `original_df_rfm` with an added 'Cluster' column.
    """
    kmeans = KMeans(n_clusters = clusters_number, random_state = 1)
    kmeans.fit(normalised_df_rfm)
    # Extract cluster labels
    cluster_labels = kmeans.labels_
    # Create a cluster label column in original dataset
    df_new = original_df_rfm.assign(Cluster = cluster_labels)
    # BUG FIX: embed the *scaled feature* space, not df_new — the original
    # fed CustomerID and the Cluster label into t-SNE, so the large ID
    # magnitudes dominated the distances and the label leaked into the plot.
    model = TSNE(random_state=1)
    transformed = model.fit_transform(normalised_df_rfm)
    # Plot t-SNE, coloured by cluster assignment.
    plt.title('Flattened Graph of {} Clusters'.format(clusters_number))
    sns.scatterplot(x=transformed[:,0], y=transformed[:,1], hue=cluster_labels, style=cluster_labels, palette="Set1")
    return df_new
# Fit and visualise K-Means for K = 3, 4 and 5, stacked in one figure.
plt.figure(figsize=(10, 10))
plt.subplot(3, 1, 1)
df_rfm_k3 = kmeans(RFM_Table_New_scaled, 3, RFM_Table)
plt.subplot(3, 1, 2)
df_rfm_k4 = kmeans(RFM_Table_New_scaled, 4, RFM_Table)
plt.subplot(3, 1, 3)
df_rfm_k5 = kmeans(RFM_Table_New_scaled, 5, RFM_Table)
plt.tight_layout()
def snake_plot(normalised_df_rfm, df_rfm_kmeans, df_rfm_original):
    """Draw a 'snake plot': mean normalised R/F/M value per cluster.

    Parameters
    ----------
    normalised_df_rfm : scaled R/F/M features (array or DataFrame).
    df_rfm_kmeans : DataFrame carrying the 'Cluster' assignment column.
    df_rfm_original : the unscaled RFM table supplying index/columns/IDs.
    """
    # BUG FIX: use the df_rfm_original parameter — the original function
    # ignored it and read the global RFM_Table instead, silently coupling
    # the helper to module state.
    normalised_df_rfm = pd.DataFrame(normalised_df_rfm,
                                     index=df_rfm_original.index,
                                     columns=df_rfm_original.columns)
    # Reindexing to the original columns leaves CustomerID as all-NaN (it is
    # not one of the scaled features); restore it so the melt id_vars hold
    # real customer identifiers.
    normalised_df_rfm['CustomerID'] = df_rfm_original['CustomerID']
    normalised_df_rfm['Cluster'] = df_rfm_kmeans['Cluster']
    # Melt data into long format: one row per (customer, metric) pair.
    df_melt = pd.melt(normalised_df_rfm.reset_index(),
                      id_vars=['CustomerID', 'Cluster'],
                      value_vars=['Recency', 'Frequency', 'Monetary'],
                      var_name='Metric',
                      value_name='Value')
    plt.xlabel('Metric')
    plt.ylabel('Value')
    # pointplot draws the mean Value per Metric, one line per cluster.
    sns.pointplot(data=df_melt, x='Metric', y='Value', hue='Cluster')
    return
# Snake plots for the 3-, 4- and 5-cluster solutions, stacked in one figure.
plt.figure(figsize=(9, 9))
plt.subplot(3, 1, 1)
plt.title('Snake Plot of K-Means = 3')
snake_plot(RFM_Table_New_scaled, df_rfm_k3, RFM_Table)
plt.subplot(3, 1, 2)
plt.title('Snake Plot of K-Means = 4')
snake_plot(RFM_Table_New_scaled, df_rfm_k4, RFM_Table)
plt.subplot(3, 1, 3)
plt.title('Snake Plot of K-Means = 5')
snake_plot(RFM_Table_New_scaled, df_rfm_k5, RFM_Table)
plt.tight_layout()
# Evaluating Model: K-Means Clustering
* The Davies–Bouldin score is a metric for evaluating clustering algorithms; the smaller the score, the better the clustering.
* Scores for K-Means with K = 3, 4 and 5:
# Davies-Bouldin score for each candidate K (lower is better).
# FIX: the original rebound the name `kmeans`, shadowing the `kmeans`
# helper function defined above; it also triplicated the fit/score code.
for _n_clusters in (3, 4, 5):
    _km_model = KMeans(n_clusters=_n_clusters)
    _km_model.fit(X)
    print(davies_bouldin_score(X, _km_model.labels_))
# Inspect the chosen 4-cluster solution.
df_rfm_k4.head()
df_rfm_k4.Cluster.value_counts()
import seaborn as sns  # NOTE(review): redundant — seaborn is already imported at the top of the file
# Unique customers per cluster, largest first.
sq1=df_rfm_k4.groupby('Cluster')['CustomerID'].nunique().sort_values(ascending=False).reset_index()
plt.figure(figsize=(12,8))
sns.barplot(data=sq1, x="Cluster", y="CustomerID", palette = "Greens_d");
def rfm_values(df):
    """Summarise each cluster: mean Recency/Frequency/Monetary plus the
    cluster size (count of rows), all rounded to whole numbers.
    """
    agg_spec = {
        'Recency': 'mean',
        'Frequency': 'mean',
        'Monetary': ['mean', 'count'],
    }
    summary = df.groupby(['Cluster']).agg(agg_spec)
    return summary.round(0)
# Profile the 4-cluster solution: mean R/F/M and size per cluster.
rfm_values(df_rfm_k4)